set.seed(1)
#if (!requireNamespace("BiocManager", quietly = TRUE))   
#  install.packages("BiocManager",repos = "http://cran.us.r-project.org")  
#BiocManager::install("fgsea")
#BiocManager::install("limma")
#BiocManager::install("Biobase")
#install.packages("ggplot2")
#install.packages("igraph")
library(limma)
library(Biobase)
library(ggplot2)
library(igraph)
library(data.table)
library(fgsea)
library(readr)
library(biomaRt)
library(edgeR)

# Get portal data and use only gtex portal samples
gtex.portal_thyroid <- read.csv("/home/ubuntu/gtex.portal_thyroid.txt", sep="")

##############################
bonobo = data.frame(fread("/home/esaha/BONOBO/network/BonoboPanda_GTEx_thyroid.txt"))
genes = bonobo$V1
bonobo = bonobo[,match(colnames(gtex.portal_thyroid), colnames(bonobo))]
bonobo_indices =  unlist(lapply(strsplit(colnames(bonobo), split = ".", fixed = T), function(x){paste(paste(x, collapse = "-"), 1, sep = ".")}))

# Get gene data
GTEx_thyroid = get(load("/home/esaha/BONOBO/data/GTEx_thyroid/GTEx_thyroid_RSE.RData"))
fdata.GTEx = as.data.frame(GTEx_thyroid@rowRanges)

# Discard Y genes
### Get GenecIDs for Y genes 
Y_genes.GTEx = fdata.GTEx$gene_id[which(fdata.GTEx$seqnames == "chrY")]
Y_genes.GTEx = gsub("\\..*", "", Y_genes.GTEx)

bonobo = bonobo[-which(genes %in% Y_genes.GTEx),]
genes = genes[-which(genes %in% Y_genes.GTEx)]

gene_id = sub("\\..*", "", fdata.GTEx$gene_id)
gene_names = fdata.GTEx$gene_name[match(genes, gene_id)]

# Load pathway results from BONOBO and PANDA
bonobo_gsea = get(load("/home/esaha/BONOBO/plots/GTEx_thyroid/BONOBO_GTEx_Thyroid_GSEA_GOBP.RData"))
panda_gsea = get(load("/home/esaha/BONOBO/plots/GTEx_thyroid/PANDA_GTEx_Thyroid_GSEA_GOBP.RData"))

# Find which pathways have conflicting evidence from BONOBO vs PANDA
common_pathways = intersect(bonobo_gsea$pathway[which(bonobo_gsea$padj < 0.05)], panda_gsea$pathway[which(panda_gsea$padj < 0.05)])
consensus = sign(bonobo_gsea$NES[match(common_pathways, bonobo_gsea$pathway)])*sign(panda_gsea$NES[match(common_pathways, panda_gsea$pathway)])
conflicting_pathways = common_pathways[which(consensus == -1)]
joint_pval_conflicting_pathways = bonobo_gsea$padj[match(conflicting_pathways, bonobo_gsea$pathway)]*panda_gsea$padj[match(conflicting_pathways, panda_gsea$pathway)]
ranked_conflicting_pathways = conflicting_pathways[order(joint_pval_conflicting_pathways)]

# NES Table for Conflicting Pathways
conflicting_pathway_names = stringr::str_to_title(lapply(conflicting_pathways, function(x){paste(unlist(strsplit(x, split = "_"))[-1], collapse = " ")}))
bonobo_NES = round(bonobo_gsea$NES[match(conflicting_pathways, bonobo_gsea$pathway)],3)
panda_NES = round(panda_gsea$NES[match(conflicting_pathways, panda_gsea$pathway)],3)
conflict_table = cbind(conflicting_pathway_names, bonobo_NES, panda_NES)
colnames(conflict_table) = c("Pathway Names", "NES (individual network)", "NES (population network")
conflict_table

write.table(conflict_table, "/home/esaha/BONOBO/plots/GTEx_thyroid/GTEx_Thyroid_BONOBO_PANDA_conflictingPathways.txt", quote = F, sep = "\t")

# Compute pathway scores
pathways <- gmtPathways("/home/ubuntu/c5.go.bp.v2022.1.Hs.symbols.gmt")
pathway_score = data.frame(lapply(pathways[match(ranked_conflicting_pathways, names(pathways))], function(x){colMeans(bonobo[which(gene_names %in% x),])}))

# Get phenotypic data
phenotypes <- read.csv("~/BONOBO/data/GTEx_thyroid/thyroid_phenotypes.txt", sep="")
GTEx_phenotypes = as.data.frame(cbind(phenotypes$SEX, phenotypes$AGE, phenotypes$RACE, 
                                      phenotypes$BMI, phenotypes$TRISCH, 
                                      phenotypes$gtex.smrin, phenotypes$gtex.smnabtcht,
                                      phenotypes$MHSMKSTS))
rownames(GTEx_phenotypes) = rownames(phenotypes)
colnames(GTEx_phenotypes) = c("gender", "age", "race", "bmi", "ischemic_time", "rna_degrad", "batch", "smoking")
GTEx_phenotypes =  GTEx_phenotypes[match(bonobo_indices, rownames(GTEx_phenotypes)),]

# Define the covariates: gender, age, race etc
gender <- GTEx_phenotypes$gender
gender[which(gender == 1)] = "MALE"
gender[which(gender == 2)] = "FEMALE"
gender = factor(gender, levels = c("FEMALE", "MALE"))

age <- as.numeric(as.character(GTEx_phenotypes$age))
age[which(is.na(age))] <- mean(age,na.rm=TRUE)

# Categorize race 1, 4, 99 as single class "others", 2: "black", 3: "white"
race <- as.numeric(as.character(GTEx_phenotypes$race))
race[which(race != 2 & race != 3)] = "others"
race[which(race == 2)] = "black or african american"
race[which(race == 3)] = "white"
race <- as.factor(race)

bmi <- as.numeric(as.character(GTEx_phenotypes$bmi))
bmi[which(is.na(bmi))] <- mean(bmi,na.rm=TRUE)

ischemic_time = GTEx_phenotypes$ischemic_time
hour_minute = strsplit(ischemic_time, split=" ")
ischemic_timeH = sapply(hour_minute, function(x){as.numeric(as.character(x[1]))+as.numeric(as.character(x[1]))/60})

rna_degrad = as.numeric(as.character(GTEx_phenotypes$rna_degrad))
rna_degrad[which(is.na(rna_degrad))] <- mean(rna_degrad,na.rm=TRUE)

batch_effect = as.factor(GTEx_phenotypes$batch)

smoking_status = GTEx_phenotypes$smoking
smoking_status[which(is.na(smoking_status))] = "Unknown"
smoking_status = factor(smoking_status, levels = c("No", "Yes", "Unknown"))

# Find out why immune pathway is higher in male from PANDA but higher in female from BONOBO
anova(lm(ischemic_timeH~gender))
anova(lm(age~gender))
chisq.test(gender, smoking_status)
chisq.test(gender, race)

# ischemic time is the most significant confounder as male samples have significantly higher ischemic time
wilcox.test(ischemic_timeH[which(gender == "MALE")], ischemic_timeH[which(gender == "FEMALE")], alternative = "greater")

# Plot pathway score by sex
library(ggplot2)
pathway_name = paste(unlist(strsplit(ranked_conflicting_pathways[1], split = "_"))[-1], collapse = " ")
pathway_name = stringr::str_to_title(pathway_name)
fulldata = data.frame(pathway_score[,1])
colnames(fulldata) = "score"
fulldata$sex = gender
fulldata$smoking = smoking_status
fulldata$ischemic_time = ischemic_timeH
fulldata$ischemic_binary = rep("low", nrow(fulldata))
fulldata$ischemic_binary[which(fulldata$ischemic_time > median(fulldata$ischemic_time))] = "high"
fulldata = fulldata[-which(fulldata$smoking == "Unknown"),]
ggplot(fulldata, aes(x=sex, y=score, color=ischemic_binary)) +
  geom_boxplot(outlier.shape = NA) +
  scale_y_continuous(limits = quantile(fulldata$score, c(0.01, 0.99)))

# Plot composite pathway score (average of 12 mismatch pathway) by sex
library(ggplot2)
score = rowMeans(pathway_score)
fulldata = data.frame(score)
colnames(fulldata) = "score"
fulldata$sex = gender
fulldata$smoking = smoking_status
fulldata$ischemic_time = ischemic_timeH
fulldata$ischemic_binary = rep("low", nrow(fulldata))
fulldata$ischemic_binary[which(fulldata$ischemic_time > median(fulldata$ischemic_time))] = "high"
fulldata = fulldata[-which(fulldata$smoking == "Unknown"),]
ggplot(fulldata, aes(x=sex, y=score, fill=sex)) + geom_boxplot() +
  ggtitle("Average Pathway Targeting Score by Sex") + xlab("Ischemic time")
#geom_boxplot(outlier.shape = NA) + scale_y_continuous(limits = quantile(fulldata$score, c(0.05, 0.95)))
